30_machine_learning_model

Task: Predict the total_power that will be drained from the wallboxes on the next day using data of the last 14 days.

Import Data

The data is already preprocessed. I just removed one additional feature we used for plotting in another task.

wallboxes_jan_aug_DT <- fread("./data/preprocessed/total_power_jan-aug.csv")

wallboxes_jan_aug_DT[, battery_SOC := NULL]

Feature Engineering

For this task I performed some feature engineering where I tried to enrich the data I have so far. I started off with two columns, namely Date and total_power. Finally, I came up with 4 new features:

  • total_power_previous_day: As the name already suggests I added a column containing the total_power that was used on the previous day.
  • total_power_same_day_previous_week: Again very simple I added a column containing the total_power that was used on the same day last week.
  • average_total_power_last_seven_days: For this I calculated the average total_power that was used during the last seven days.
  • is_weekend: This column is set to 1 if the corresponding Date was a weekend (saturday, sunday; in my case “Samstag”, “Sonntag” because of german language) or 0 if it was a normal weekday.
# 1. total_power used on previous day
wallboxes_jan_aug_DT[, total_power_previous_day := shift(total_power, 1)]

# 2. total_power used on the same day last week
wallboxes_jan_aug_DT[, total_power_same_day_previous_week := shift(total_power, 7)]

# 3. average total_power that was used in the last 7 days (rolling average)
wallboxes_jan_aug_DT <- wallboxes_jan_aug_DT %>%
                          mutate(average_total_power_last_seven_days = rollmean(total_power, k = 7, fill = NA, align = "right"))
                                 
# I found out that this calculation sums the last X values and divides it by X and then writes it to row X, this doesn't
# make sense because when we want to predict the total_power that will be used on a day then we of course don't know the
# total_power of this day already. Therefore all of the solutions have to be shifted by 1. So that for example the rolling average
# of the first 7 days is written in row 8. Meaning we can use this knowledge to predict total_power on day 8.

wallboxes_jan_aug_DT[, average_total_power_last_seven_days := shift(average_total_power_last_seven_days, 1)]

# 4. weekend or not
wallboxes_jan_aug_DT[, is_weekend := weekdays(Date)]
wallboxes_jan_aug_DT$is_weekend <- ifelse(wallboxes_jan_aug_DT$is_weekend %in% c("Samstag", "Sonntag"), 1, 0)

Let’s have a look at the data so far.

str(wallboxes_jan_aug_DT)
## Classes 'data.table' and 'data.frame':   233 obs. of  6 variables:
##  $ Date                               : IDate, format: "2022-01-01" "2022-01-02" ...
##  $ total_power                        : num  51.9 73.4 76.3 91 216.9 ...
##  $ total_power_previous_day           : num  NA 51.9 73.4 76.3 91 ...
##  $ total_power_same_day_previous_week : num  NA NA NA NA NA ...
##  $ average_total_power_last_seven_days: num  NA NA NA NA NA ...
##  $ is_weekend                         : num  1 1 0 0 0 0 0 1 1 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
summary(wallboxes_jan_aug_DT)
##       Date             total_power     total_power_previous_day
##  Min.   :2022-01-01   Min.   : 14.26   Min.   : 14.26          
##  1st Qu.:2022-02-28   1st Qu.:105.04   1st Qu.:104.95          
##  Median :2022-04-27   Median :160.65   Median :160.40          
##  Mean   :2022-04-27   Mean   :166.96   Mean   :166.78          
##  3rd Qu.:2022-06-24   3rd Qu.:214.96   3rd Qu.:215.45          
##  Max.   :2022-08-21   Max.   :468.21   Max.   :468.21          
##                                        NA's   :1               
##  total_power_same_day_previous_week average_total_power_last_seven_days
##  Min.   : 14.26                     Min.   :109.2                      
##  1st Qu.:103.70                     1st Qu.:145.9                      
##  Median :158.59                     Median :164.7                      
##  Mean   :165.72                     Mean   :167.7                      
##  3rd Qu.:213.08                     3rd Qu.:187.2                      
##  Max.   :468.21                     Max.   :243.9                      
##  NA's   :7                          NA's   :7                          
##    is_weekend    
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.2918  
##  3rd Qu.:1.0000  
##  Max.   :1.0000  
## 
head(wallboxes_jan_aug_DT, n = 15)
##           Date total_power total_power_previous_day
##  1: 2022-01-01    51.94718                       NA
##  2: 2022-01-02    73.40652                 51.94718
##  3: 2022-01-03    76.32785                 73.40652
##  4: 2022-01-04    90.97021                 76.32785
##  5: 2022-01-05   216.92926                 90.97021
##  6: 2022-01-06   103.36748                216.92926
##  7: 2022-01-07   235.61880                103.36748
##  8: 2022-01-08   186.28714                235.61880
##  9: 2022-01-09    92.06895                186.28714
## 10: 2022-01-10   437.82018                 92.06895
## 11: 2022-01-11    41.78920                437.82018
## 12: 2022-01-12   194.65295                 41.78920
## 13: 2022-01-13    72.31517                194.65295
## 14: 2022-01-14   208.54007                 72.31517
## 15: 2022-01-15   208.51944                208.54007
##     total_power_same_day_previous_week average_total_power_last_seven_days
##  1:                                 NA                                  NA
##  2:                                 NA                                  NA
##  3:                                 NA                                  NA
##  4:                                 NA                                  NA
##  5:                                 NA                                  NA
##  6:                                 NA                                  NA
##  7:                                 NA                                  NA
##  8:                           51.94718                            121.2239
##  9:                           73.40652                            140.4153
## 10:                           76.32785                            143.0814
## 11:                           90.97021                            194.7231
## 12:                          216.92926                            187.6973
## 13:                          103.36748                            184.5150
## 14:                          235.61880                            180.0789
## 15:                          186.28714                            176.2105
##     is_weekend
##  1:          1
##  2:          1
##  3:          0
##  4:          0
##  5:          0
##  6:          0
##  7:          0
##  8:          1
##  9:          1
## 10:          0
## 11:          0
## 12:          0
## 13:          0
## 14:          0
## 15:          1

As you can see I now have some NA values because I shifted the data around. Let’s remove observations containing NAs in one of their columns.

# colSums(is.na(wallboxes_jan_aug_DT))
wallboxes_jan_aug_DT <- na.omit(wallboxes_jan_aug_DT)
# str(wallboxes_jan_aug_DT)
# summary(wallboxes_jan_aug_DT)
head(wallboxes_jan_aug_DT, n = 15)
##           Date total_power total_power_previous_day
##  1: 2022-01-08   186.28714                235.61880
##  2: 2022-01-09    92.06895                186.28714
##  3: 2022-01-10   437.82018                 92.06895
##  4: 2022-01-11    41.78920                437.82018
##  5: 2022-01-12   194.65295                 41.78920
##  6: 2022-01-13    72.31517                194.65295
##  7: 2022-01-14   208.54007                 72.31517
##  8: 2022-01-15   208.51944                208.54007
##  9: 2022-01-16    33.40852                208.51944
## 10: 2022-01-17   121.15128                 33.40852
## 11: 2022-01-18   106.95493                121.15128
## 12: 2022-01-19   194.38071                106.95493
## 13: 2022-01-20    86.13421                194.38071
## 14: 2022-01-21   201.98699                 86.13421
## 15: 2022-01-22   101.58306                201.98699
##     total_power_same_day_previous_week average_total_power_last_seven_days
##  1:                           51.94718                            121.2239
##  2:                           73.40652                            140.4153
##  3:                           76.32785                            143.0814
##  4:                           90.97021                            194.7231
##  5:                          216.92926                            187.6973
##  6:                          103.36748                            184.5150
##  7:                          235.61880                            180.0789
##  8:                          186.28714                            176.2105
##  9:                           92.06895                            179.3866
## 10:                          437.82018                            171.0065
## 11:                           41.78920                            125.7681
## 12:                          194.65295                            135.0775
## 13:                           72.31517                            135.0386
## 14:                          208.54007                            137.0127
## 15:                          208.51944                            136.0766
##     is_weekend
##  1:          1
##  2:          1
##  3:          0
##  4:          0
##  5:          0
##  6:          0
##  7:          0
##  8:          1
##  9:          1
## 10:          0
## 11:          0
## 12:          0
## 13:          0
## 14:          0
## 15:          1

Random Forest Models

Finally let’s train multiple random forest models and always try predicting the total_power that will be used on the next day using the data of the last 14 days. After that I calculate the average Mean Absolute Error (MAE) of all models and the average Mean Absolute Percentage Error (MAPE) of all models so that I have some metrics I can use for evaluating the models.

MAPE <- function(predicted, actual){
  mape <- mean(abs((actual - predicted)/actual))*100
  return (mape)
}

actuals <- wallboxes_jan_aug_DT
start_date <- as.Date("2022-01-08")
end_date <- as.Date("2022-01-22")
predicted_date <- list() # the date I tried to predict will get stored in this list
predictions <- list() # the predictions will get stored in this list
real_values <- list() # the actual values will get stored in this list
errors <- list() # the MAE for each model will get stored in this list
mape_errors <- list() # the MAPE for each model will get stored in this list

i <- 1
while (end_date <= as.Date("2022-08-21")) {
  
  # print(paste('Currently calculating prediction for', end_date))
  
  # use only the last 14 days to train a RF model
  training <- actuals[Date >= start_date & Date < end_date,]
  test <- actuals[actuals$Date == end_date]
  
  rf_model <- train(total_power ~ . - Date, 
                    data = training, 
                    method = "ranger")
  
  pred <- predict(rf_model, newdata = test) # predict 1 day
  predictions[[i]] <- pred # store the prediction
  predicted_date[[i]] <- end_date
  actual <- test$total_power
  real_values[[i]] <- actual
  errors[[i]] <- abs(actual - pred) # store the difference between actual and predicted value
  mape_errors[[i]] <- MAPE(pred, actual)
  
  i <- i+1
  
  start_date <- start_date + 1
  end_date <- end_date + 1
}

mean_absolute_error <- mean(unlist(errors))
mean_absolute_error
## [1] 69.67681
mean_absolute_percentage_error <- mean(unlist(mape_errors))
mean_absolute_percentage_error
## [1] 70.3704

Let’s also look at a plot to better visualize the results. The first plot uses lines+markers to better identify the single points, the second one only uses lines.

# lines+markers
plot_ly() %>%
  add_trace(x = ~predicted_date, y = ~real_values, name = 'Actuals', type = 'scatter', mode = 'lines+markers') %>%
  add_trace(x = ~predicted_date, y = ~predictions, name = 'Predictions', type = 'scatter', mode = 'lines+markers') %>%
  layout(title = 'RF predictions using the last 14 days', xaxis = list(title="Date"), yaxis =list(title="kWh"))
# only lines
plot_ly() %>%
  add_trace(x = ~predicted_date, y = ~real_values, name = 'Actuals', type = 'scatter', mode = 'lines') %>%
  add_trace(x = ~predicted_date, y = ~predictions, name = 'Predictions', type = 'scatter', mode = 'lines') %>%
  layout(title = 'RF predictions using the last 14 days', xaxis = list(title="Date"), yaxis =list(title="kWh"))